---
title: "Benchmarking_Supplementary"
author: ""
date: ""
output: html_document
---

```{r setup, include=FALSE}

rm(list = ls())

library(tidyverse)
library(jsonlite)
library(psych)
library(irr)
library(readxl)
library(forcats)
library(patchwork)
library(broom)
library(caret)
library(quanteda)
library(quanteda.textstats)
library(irr)
library(gridExtra)
library(kableExtra)
library(stargazer)
library(koRpus)
library(devtools)
#install_github("unDocUMeantIt/koRpus.lang.es")
library(koRpus.lang.es)

```

# Intercoder Reliability Analysis

```{r}
# Load the coding sheet exported for each coder, plus the group codings.
cd_cod <- read_csv("Undergrad_codings_coder2 - Sheet1.csv")
aw_cod <- read_csv("Undergrad_codings_coder3 - Sheet1.csv")
ar_cod <- read_csv("Undergrad_codings_coder1 - Sheet1.csv")
group_cod <- read_csv("Group_ROCCA_codings - Sheet1.csv")

# Stack the four sheets into one table.
tt <- rbind(aw_cod, ar_cod, cd_cod, group_cod)

# The ten main-theme columns that are reshaped to long format below.
theme_cols <- c(
  "MAIN_HUM_HUMAN", "MAIN_HUM_REFUGE", "MAIN_THREAT_CULT", "MAIN_THREAT_DIS",
  "MAIN_THREAT_ECON", "MAIN_THREAT_VIOL", "MAIN_THREAT_INSTAB", "MAIN_BEN_CULT",
  "MAIN_BEN_ECON", "MAIN_PI"
)

# Long format: one row per CODER x DOCID x theme column (`name`), with the
# code in `value`. `lab4` is the broader category, taken from the first of
# HUM / THREAT / BEN / PI that appears in the theme column name.
tt_coders <- tt %>%
  select(all_of(c(theme_cols, "CODER", "DOCID"))) %>%
  pivot_longer(cols = all_of(theme_cols)) %>%
  mutate(lab4 = case_when(
    str_detect(name, "HUM") ~ "human",
    str_detect(name, "THREAT") ~ "threat",
    str_detect(name, "BEN") ~ "ben",
    str_detect(name, "PI") ~ "pi"
  ))

# Pull one coder's rows out of the long table `tt_coders` (read from the
# calling environment).
#
# coder: value of the CODER column to keep (e.g. "AW").
# Returns the coder's rows without any row containing an NA, with the `value`
# column renamed to CODER_<coder> and the CODER column removed.
separate_coder <- function(coder) {
  value_col <- paste0("CODER_", coder)
  one_coder <- drop_na(filter(tt_coders, CODER == coder))
  renamed <- rename_with(
    one_coder,
    function(col_name) str_replace(col_name, "value", value_col)
  )
  select(renamed, -CODER)
}

# One table per coder; the group codings use "all" as their CODER value.
aw <- separate_coder("AW")
ar <- separate_coder("AR")
cd <- separate_coder("CD")
group <- separate_coder("all") %>% rename(theme = name)

# Rejoin on the same DOCIDs: one column per undergraduate coder
coders <- list(ar, cd, aw) %>%
  reduce(full_join, by = c("DOCID", "name", "lab4")) %>%
  filter(DOCID > 40) %>% #Remove these because we practiced on them together
  relocate(name, .after = CODER_AW) %>%
  rename(theme = name)

# Add the group codings as a fourth coder column.
# NOTE(review): this is a full join applied after the DOCID > 40 filter, so
# any group-coded rows with DOCID <= 40 come back in with NA for the three
# undergraduate coders - confirm that is intended.
coders <- coders %>% full_join(group, by = c("DOCID", "theme", "lab4"))

# Keep only the joined table in the workspace.
rm(list = setdiff(ls(), "coders"))

#This chunk looks at GENERAL reliability

coders <- coders %>%
  relocate(lab4, .after = theme) %>%
  filter(!(theme %in% c("MAIN_BEN_CULT", "MAIN_THREAT_CULT"))) #Remove cultural benefit and cultural threat labels

# Collapse to the 4-label specification: within each document and broad
# category, a coder's value is the max over the underlying themes.
# `.groups = "drop"` returns an ungrouped tibble; without it the result stays
# grouped by DOCID and summarize() emits a regrouping message.
coders_4lab <- coders %>%
  group_by(DOCID, lab4) %>%
  summarize(across(starts_with("CODER"), max), .groups = "drop") %>%
  relocate(lab4, .after = CODER_AW)

# Cohen's kappa and ICC for a pair of raters.
#
# data: data frame with one column per rater.
# cols: the two columns of `data` to compare (positions or names).
#
# Returns the object produced by irr::icc(), as before, so existing callers
# keep working. The unweighted Cohen's kappa from irr::kappa2() used to be
# computed and thrown away; it is now attached to the result and can be read
# with attr(result, "kappa").
compute_reliability <- function(data, cols) {
  ratings <- data[, cols]
  reliability <- icc(ratings)
  attr(reliability, "kappa") <- kappa2(ratings, "unweighted")
  reliability
}

# Column positions of the two raters compared in each pairwise check.
# NOTE(review): raters are addressed by position, so this depends on the
# column order of `coders` / `coders_4lab`; confirm that positions 2-4 hold
# the three coder columns and that the pair names match the intended coder
# numbering.
rater_pairs <- list(
  coders12 = c(4, 3),
  coders13 = c(4, 2),
  coders23 = c(2, 3)
)

# Pairwise reliability on the individual theme labels.
reliability_results <- lapply(
  rater_pairs,
  function(cols) compute_reliability(coders, cols)
)

# Pairwise reliability on the collapsed 4-label specification.
reliability_4lab_results <- lapply(
  rater_pairs,
  function(cols) compute_reliability(coders_4lab, cols)
)
```


```{r disagreements at entire article level}
# Human codings; each file carries a `code_agreement` column.
human_10lab <- read_csv("human10_agree.csv")
human_4lab <- read_csv("human4_agree.csv")

# Tabulate agreement / disagreement between the human coders.

# 8 label specification
print(table(human_10lab[["code_agreement"]])) #87/590 = 15% disagreement

# 4 label specification
print(table(human_4lab[["code_agreement"]])) #77/590 = 13% disagreement

# Clear the workspace before the next section.
rm(list = ls())
```

# Calculate differences across earlier and later prompts

```{r create functions}
# Read a CSV of AI labels and tag every row with when and how it was coded.
#
# file:   path to the CSV.
# date:   value stored in the new `date_coded` column.
# prompt: value stored in the new `prompt` column.
# Returns the table without its `...1` column.
read_and_process <- function(file, date, prompt) {
  labels <- read_csv(file)
  labels <- mutate(labels, date_coded = date, prompt = prompt)
  select(labels, -...1)
}

# Read a CSV of human labels and keep only the documents that were also
# coded by GPT.
#
# file:       path to the CSV.
# gpt_docids: docids to keep.
# Returns the table without its `...1` and `text` columns, with `date_coded`
# set to the string "NA" and `coder` set to "human".
read_and_process_human <- function(file, gpt_docids) {
  human <- read_csv(file)
  human <- select(human, -c(...1, text))
  human <- mutate(human, date_coded = "NA", coder = "human")
  filter(human, docid %in% gpt_docids)
}

# Per-column Hamming loss between two label matrices.
#
# human_mat: matrix of reference labels (rows = documents, cols = labels).
# model_mat: matrix of labels to compare, same dimensions as `human_mat`.
#
# Returns an unnamed numeric vector with one entry per column: the share of
# rows where the two matrices disagree in that column. Zero-column input
# gives numeric(0) (the previous 1:ncol() loop failed on it).
# Stops if either argument is not a matrix or if the dimensions differ.
calculate_hamming_loss <- function(human_mat, model_mat) {
  if (!is.matrix(human_mat) || !is.matrix(model_mat)) {
    stop("Both objects should be matrices.")
  }
  if (nrow(human_mat) != nrow(model_mat) || ncol(human_mat) != ncol(model_mat)) {
    stop("Dimensions don't match.")
  }
  # Element-wise mismatch matrix, averaged down each column; unname() keeps
  # the result unnamed, as it was before.
  unname(colMeans(human_mat != model_mat))
}

# Column means of a label table (the proportion of 1s when labels are 0/1).
#
# data:   matrix or data frame with one column per theme.
# prompt: value stored in the `prompt` column of the result.
# Returns a tibble with one row per column of `data`: `mean_prop_1s`,
# `theme` (the column name) and `prompt`. NAs are ignored in the means.
get_means <- function(data, prompt) {
  col_means <- apply(data, MARGIN = 2, FUN = function(col) mean(col, na.rm = TRUE))
  tibble(
    mean_prop_1s = col_means,
    theme = names(col_means),
    prompt = prompt
  )
}

# Write a list of records to a JSONL file, one JSON object per line.
#
# data_list: list whose elements are each serialised with
#            toJSON(auto_unbox = TRUE).
# file_name: path of the file to create (overwritten if it exists).
# Returns NULL invisibly.
write_jsonl <- function(data_list, file_name) {
  con <- file(file_name, open = "wt")
  # Close the connection even if serialisation or writing fails part-way;
  # previously an error inside the loop left the connection open.
  on.exit(close(con), add = TRUE)

  for (record in data_list) {
    writeLines(toJSON(record, auto_unbox = TRUE), con)
  }

  invisible(NULL)
}

# Read a batch-output JSONL file and parse the model's answer on each line.
#
# file_path: path to the JSONL file.
# Returns a list with one tibble per line of the file.
read_and_parse_jsonl <- function(file_path) {
  parse_line <- function(line) {
    record <- jsonlite::fromJSON(line)
    answer <- record$response$body$choices$message$content
    # Strip code-fence markers, the word "json" and newlines, then turn
    # single quotes into double quotes so the answer parses as JSON.
    answer <- gsub("json|\\n|```", "", answer)
    answer <- gsub("'", '"', answer)
    as_tibble(jsonlite::fromJSON(answer))
  }

  lapply(readLines(file_path), parse_line)
}

# Load the gold-standard file, dropping rows with docid 0 and the `...1` column
p8 <- read_csv("AI_goldstandard_ten_v2.csv")
p8 <- filter(p8, docid != 0)
p8 <- dplyr::select(p8, -...1)

```


```{r create prompts, eval = F}
#I do not evaluate this code because it requires the scraped article texts, which I withhold from the replication dataset
#However, I include the outputs of this exercise, so the reader can continue replicating my code after this chunk

# Create excerpt dataframe
excerpts <- p8 %>% select(docid, text) %>% arrange(docid)

# Create a vector of file names
prompt_files <- c("chatgpt_prompt_ten v1.txt",
                  "chatgpt_prompt_ten v1_expl.txt", 
                  "chatgpt_prompt_ten v7 (2).txt", 
                  "chatgpt_prompt_ten v7_expl.txt")

# Read a prompt file into a single string: lines are joined with spaces,
# double spaces are collapsed once, and double quotes become single quotes.
load_prompt_text <- function(prompt_file) {
  readLines(prompt_file, warn = FALSE, 
            encoding = "UTF-8") |>
    paste(collapse = " ") |>
    gsub("  ", " ", x = _) |>
    gsub('[\"]', "'", x = _) 
}

# Build one chat-completion batch request per article.
#
# prompt_file: path to the prompt text.
# texts:       character vector of article texts, in request order.
# model, max_tokens: values placed in every request body.
# Returns a list of request objects with custom_id "request-1",
# "request-2", ... in the order of `texts`.
build_batch_requests <- function(prompt_file, texts,
                                 model = "gpt-4o-2024-08-06",
                                 max_tokens = 1000) {
  # paste() would turn a zero-length `texts` into one empty prompt.
  if (length(texts) == 0) {
    return(list())
  }

  prompt_text <- load_prompt_text(prompt_file)

  # Prompt followed by the article wrapped in <text> tags
  prompts <- paste(prompt_text, "<text>", texts, "</text>")

  system_part <- list(role = "system", content = "You are a research coder for newspaper articles")

  lapply(seq_along(prompts), function(i) {
    user_part <- list(role = "user", content = prompts[i])

    list(
      custom_id = paste0("request-", i),
      method = "POST",
      url = "/v1/chat/completions",
      body = list(
        model = model,
        messages = list(system_part, user_part),
        max_tokens = max_tokens
      )
    )
  })
}

# One request list per prompt version, in the order of `prompt_files`
requests1 <- build_batch_requests(prompt_files[1], excerpts$text)
requests2 <- build_batch_requests(prompt_files[2], excerpts$text)
requests3 <- build_batch_requests(prompt_files[3], excerpts$text)
requests4 <- build_batch_requests(prompt_files[4], excerpts$text)

#Write
#write_jsonl(requests1, "prompt1.jsonl")
#write_jsonl(requests2, "prompt1_expl.jsonl")
#write_jsonl(requests3, "prompt7.jsonl")
#write_jsonl(requests4, "prompt7_expl.jsonl")

```


```{r clean and merge output}
#Add in missing row for ease of merging
p8 <- p8 %>% add_row(docid = 451) %>% arrange(docid)

# Read a batch output file of plain label answers, tag every parsed answer
# with the prompt version, and attach docids by position.
# NOTE(review): docids are matched to answers purely by position, so the
# output file must have one line per row of `p8`, in the same order.
load_label_output <- function(file, prompt_label, docids) {
  answers <- read_and_parse_jsonl(file)
  answers <- lapply(answers, function(df) {
    df$prompt <- prompt_label
    df
  })
  map2(answers, docids, ~ mutate(.x, docid = .y))
}

#Read in output files
prompt1 <- load_label_output("prompt1_output.jsonl", "v1", p8$docid)
prompt7 <- load_label_output("prompt7_output.jsonl", "v7", p8$docid)

#Merge
clean_results <- bind_rows(prompt1, prompt7) %>% as_tibble()

#For explanations!

# Function to extract the first set of curly brackets and convert to JSON
extract_first_part <- function(row_str) {
  match <- regexpr("\\{[^\\}]*\\}", row_str)  # Match the first set of curly brackets
  first_part <- regmatches(row_str, match)  # Extract the matched part
  
  if (length(first_part) > 0 && nchar(first_part) > 0) {
    # Replace single quotes with double quotes for valid JSON
    first_part <- gsub("'", '"', first_part)
    
    # Parse the string as JSON to convert it to a list
    first_part <- fromJSON(first_part)
  } else {
    first_part <- NA  # Return NA for no match instead of NULL
  }
  
  first_part
}

# Read a batch output file whose answers contain a label object followed by
# an explanation; keep only the first {...} object of each answer.
#
# file:         path to the batch output JSONL file.
# prompt_label: value stored in the `prompt` column.
# docids:       docids attached to the rows by position.
# Returns a data frame with one row per line of the file.
load_expl_output <- function(file, prompt_label, docids) {
  responses <- lapply(readLines(file), fromJSON)

  # One answer string per response; vapply() fails loudly if a response
  # does not contain exactly one answer.
  answers <- vapply(
    responses,
    function(x) x$response$body$choices$message$content,
    character(1)
  )

  # Clean once after collecting (the earlier loop re-cleaned the whole
  # vector on every iteration).
  answers <- gsub("json|\\n|```", "", answers)

  label_parts <- lapply(answers, extract_first_part)

  # Convert the list of parsed label objects to a dataframe and add in docids
  out <- as.data.frame(do.call(rbind, lapply(label_parts, as.data.frame)))
  out$docid <- docids
  out$prompt <- prompt_label
  out
}

prompt1_expl <- load_expl_output("prompt1_expl_output.jsonl", "v1_expl", p8$docid)

#Now for prompt7
prompt7_expl <- load_expl_output("prompt7_expl_output.jsonl", "v7_expl", p8$docid)

#Merge
clean_results2 <- bind_rows(prompt1_expl, prompt7_expl) %>% as_tibble()

#Now merge both, dropping docid 0 and the filler row added above (docid 451)
final_df <- bind_rows(clean_results, clean_results2) %>% filter(docid != 0) %>% filter(docid != 451)

#Join in author codings
# Note: the name `pi` masks base R's constant `pi` while this object exists.
pi <- read_csv("pi_codings_2ndhalf.csv") %>%
  mutate(prompt = "author") %>%
  dplyr::select(-...10)

names(pi) <- tolower(names(pi))

# Columns 2-9 hold the eight theme codes
names(pi)[2:9] <- paste0("Theme", 1:8)

# Missing codes count as 0; keep the first row for each docid.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
pi <- pi %>%
  mutate(across(Theme1:Theme8, ~ replace_na(., 0))) %>%
  distinct(docid, .keep_all = TRUE)

```


```{r analyze and visualize output}
# Prompt versions and the labels used for them on the x axis of both plots.
prompt_levels <- c("v1", "v1_expl", "v7", "v7_expl")
prompt_labels <- c("Earlier Prompt", "Earlier Prompt \nwith Explanations", 
                   "Most Recent Prompt", "Most Recent Prompt \nwith Explanations")

# Mean Hamming loss against the author codings, one value per prompt.
# NOTE(review): the model rows and the author rows are compared by position,
# not joined on docid - confirm both tables hold the same documents in the
# same order.
hamming_results <- final_df %>%
  group_by(prompt) %>%
  summarise(
    hamming_loss = mean(calculate_hamming_loss(
      as.matrix(select(cur_data(), starts_with("Theme"))),  # Model labels for this prompt
      as.matrix(select(pi, starts_with("Theme")))  # Author labels
    )),
    .groups = 'drop'
  )

# Bar plot of Hamming loss by prompt
prompt_comparisons <- hamming_results %>% 
  mutate(prompt = factor(prompt, levels = prompt_levels, labels = prompt_labels)) %>%
  ggplot(aes(x = prompt, y = hamming_loss)) +
  theme_minimal() +
  geom_col(fill = "grey40") +
  labs(title = NULL,
       x = NULL,
       y = "Hamming Loss") +
  scale_y_continuous(breaks = c(0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3), 
                     limits = c(0, 0.3)) + 
  theme(panel.grid.minor = element_blank())

prompt_comparisons

#Save
#ggsave(prompt_comparisons, filename = "prompt_comparisons.png", width = 7, height = 5)

# Average share of labels coded 1 (mean of the per-theme means), per prompt
prop_results <- final_df %>%
  group_by(prompt) %>%
  summarise(
    prop = mean(get_means(
      as.matrix(select(cur_data(), starts_with("Theme"))),  # Model labels for this prompt
      prompt = unique(prompt)  # Pass the current prompt
    )$mean_prop_1s),  # Per-theme means returned by get_means
    .groups = 'drop'
  )

# Bar plot of the percentage of labels coded 1, by prompt
prompt_comparisons_prop <- prop_results %>% 
  mutate(prop = prop *100, 
         prompt = factor(prompt, levels = prompt_levels, labels = prompt_labels)) %>%
  ggplot(aes(x = prompt, y = prop)) +
  theme_minimal() +
  geom_col(fill = "grey40") +
  labs(title = NULL,
       x = NULL,
       y = "% Coded as 1s") +
  scale_y_continuous(breaks = c(0, 5, 10, 15, 20, 25), 
                     limits = c(0, 25)) +
  theme(panel.grid.minor = element_blank())

prompt_comparisons_prop

#Save
#ggsave(prompt_comparisons_prop, filename = "prompt_comparisons_prop.png", width = 7, height = 5)
```

# Compare author codings with undergraduate and LLMs

```{r}

rm(list = ls())

# Read a CSV of AI labels and tag every row with its provenance.
#
# file:       path to the CSV.
# date:       value stored in `date_coded`.
# coder:      value stored in `coder` (the model name).
# num_labels: value stored in `num_labels`.
# drop_ids:   docids to remove (0 and 451 by default).
# Returns the table without its `...1` column and without the dropped docids.
read_and_process <- function(file, date, coder, num_labels, drop_ids = c(0, 451)) {
  labels <- read_csv(file)
  labels <- mutate(labels, date_coded = date, coder = coder, num_labels = num_labels)
  labels <- select(labels, -...1)
  filter(labels, !docid %in% drop_ids)
}

# Each entry is c(file, date coded, model, number of labels), i.e. the first
# four arguments of read_and_process() in order.

# GPT datasets
gpt_files <- list(
  gpt590_10lab        = c("5-24-24gpt10lab.csv", "05-24-2024", "gpt-4-turbo", "10"),
  gpt590_4lab         = c("5-24-24gpt4lab.csv", "05-24-2024", "gpt-4-turbo", "4"),
  gpt_10_oneshot      = c("5-24-24gpt10lab_oneshot.csv", "05-24-2024", "gpt-4-turbo", "10"),
  gpt_4_oneshot       = c("5-24-24gpt4lab_oneshot.csv", "05-24-2024", "gpt-4-turbo", "4"),
  gpt_10_fewshot      = c("5-24-24gpt10lab_fewshot.csv", "05-24-2024", "gpt-4-turbo", "10"),
  gpt_4_fewshot       = c("5-24-24gpt4lab_fewshot.csv", "05-24-2024", "gpt-4-turbo", "4"),
  gpt4o_10lab         = c("5-24-24gpt10lab_4o.csv", "05-24-2024", "gpt-4o", "10"),
  gpt4o_4lab          = c("5-24-24gpt4lab_4o.csv", "05-24-2024", "gpt-4o", "4"),
  gpt4o_10lab_oneshot = c("5-24-24gpt10lab_4o_oneshot.csv", "05-24-2024", "gpt-4o", "10"),
  gpt4o_4lab_oneshot  = c("5-24-24gpt4lab_4o_oneshot.csv", "05-24-2024", "gpt-4o", "4"),
  gpt4o_10lab_fewshot = c("5-24-24gpt10lab_4o_fewshot.csv", "05-24-2024", "gpt-4o", "10"),
  gpt4o_4lab_fewshot  = c("5-24-24gpt4lab_4o_fewshot.csv", "05-24-2024", "gpt-4o", "4")
)

# Claude datasets
claude_files <- list(
  claude590_10lab   = c("8-8-24claude3.510labn590_zeroshot.csv", "8-8-2024", "claude-sonnet-3.5", "10"),
  claude590_4lab    = c("8-9-24claude3.54labn590_zeroshot.csv", "8-9-2024", "claude-sonnet-3.5", "4"),
  claude_10_oneshot = c("8-8-24claude3.510labn590_oneshot.csv", "8-8-2024", "claude-sonnet-3.5", "10"),
  claude_4_oneshot  = c("8-8-24claude3.54labn590_oneshot.csv", "8-8-2024", "claude-sonnet-3.5", "4"),
  claude_10_fewshot = c("6-24-24claude3.510labn590_fewshot.csv", "06-24-2024", "claude-sonnet-3.5", "10"),
  claude_4_fewshot  = c("6-24-24claude3.54labn590_fewshot.csv", "06-24-2024", "claude-sonnet-3.5", "4")
)

# Read every file in a spec list; lapply() keeps the list names.
load_label_files <- function(file_specs) {
  lapply(file_specs, function(spec) {
    read_and_process(spec[1], spec[2], spec[3], spec[4])
  })
}

# Read and process GPT datasets
gpt_data <- load_label_files(gpt_files)

# Read and process Claude datasets
claude_data <- load_label_files(claude_files)

# Per-theme means for GPT-4o few-shot (printed)
lapply(select(gpt_data$gpt4o_10lab_fewshot, Theme1:Theme8), mean)

# Read a CSV of human labels and tag every row with its provenance.
#
# file:       path to the CSV.
# num_labels: value stored in `num_labels`.
# drop_ids:   despite the name, the docids to KEEP - rows whose docid is not
#             in this vector are removed.
# Returns the table without its `...1` column, with `date_coded` set to the
# string "NA" and `coder` set to "human".
read_and_process_human <- function(file, num_labels, drop_ids) {
  human <- read_csv(file)
  human <- select(human, -c(...1))
  human <- mutate(human, date_coded = "NA", coder = "human", num_labels = num_labels)
  filter(human, docid %in% drop_ids)
}

# Human labels with agreement vars; each entry is c(file, number of labels)
human_files <- list(
  human_10lab = c("human10_agree.csv", "10"),
  human_4lab = c("human4_agree.csv", "4")
)

# Keep only the documents present in the GPT-4 Turbo zero-shot file
ids <- gpt_data$gpt590_10lab$docid
human_data <- lapply(
  human_files,
  function(spec) read_and_process_human(spec[1], spec[2], ids)
)

# Label columns are renamed by position: 2-9 (8-label file), 2-5 (4-label file)
names(human_data[["human_10lab"]])[2:9] <- paste0("Label", 1:8)
names(human_data[["human_4lab"]])[2:5] <- paste0("Label", 1:4)

# Per-label means for the human 8-label codings (printed)
lapply(select(human_data[["human_10lab"]], Label1:Label8), mean)

# Author (PI) codings. Note: the name `pi` masks base R's constant `pi`.
pi <- read_csv("pi_codings_2ndhalf.csv")

names(pi) <- tolower(names(pi))

# Columns 2-9 hold the eight label codes
names(pi)[2:9] <- paste0("Label", 1:8)

# Missing codes count as 0; keep the first row for each docid
pi <- pi %>%
  mutate(across(Label1:Label8, ~ replace_na(., 0))) %>%
  distinct(docid, .keep_all = TRUE)

# Per-label means for the author codings (printed)
lapply(select(pi, Label1:Label8), mean)

ids2 <- pi$docid

# 8-label specification, as a tibble and as a matrix
pi10 <- select(pi, Label1:Label8)
pi10mat <- as.matrix(pi10)

# Collapse the 8 labels into the broader 4 label categories:
# Label1 = labels 1-2, Label2 = labels 3-6, Label3 = label 7, Label4 = label 8
pi4 <- pi %>%
  mutate(
    hum = if_else(Label1 == 1 | Label2 == 1, 1, 0),
    threat = if_else(Label3 == 1 | Label4 == 1 | Label5 == 1 | Label6 == 1, 1, 0),
    ben = if_else(Label7 == 1, 1, 0),
    pi = Label8
  ) %>%
  select(-c(Label1:Label8, ...10, docid)) %>%
  rename(Label1 = hum, Label2 = threat, Label3 = ben, Label4 = pi)

pi4mat <- as.matrix(pi4)

# Print the mean of each of the eight labels, in the order:
# undergraduate coders, GPT-4o few-shot, author.
for (k in 1:8) {
  print(mean(human_data$human_10lab[[paste0("Label", k)]]))
  print(mean(gpt_data$gpt4o_10lab_fewshot[[paste0("Theme", k)]]))
  print(mean(pi[[paste0("Label", k)]]))
}
```


```{r}
# List of models to evaluate
# NOTE: `models` is reassigned to a colour vector further down this chunk,
# so this list is not available under that name afterwards.
models <- list(
  claude_long = claude_data$claude590_10lab,
  claude_oneshot = claude_data$claude_10_oneshot,
  claude_fewshot = claude_data$claude_10_fewshot,
  gpt_long = gpt_data$gpt590_10lab,
  gpt_oneshot = gpt_data$gpt_10_oneshot,
  gpt_fewshot = gpt_data$gpt_10_fewshot,
  gpt4o_long = gpt_data$gpt4o_10lab,
  gpt4o_oneshot = gpt_data$gpt4o_10lab_oneshot,
  gpt4o_fewshot = gpt_data$gpt4o_10lab_fewshot
)

# Keep the columns Theme1 through Theme<n_labels> of a model's output and
# rename them Label1 ... Label<n_labels>, so they line up with the human and
# author label columns.
as_label_cols <- function(df, n_labels) {
  last_theme <- paste0("Theme", n_labels)
  df %>%
    select(Theme1:all_of(last_theme)) %>%
    rename_with(~ paste0("Label", seq_len(n_labels)), everything())
}

# 8-label specification: zero shot, one shot, few shot
gpt10mat <- as_label_cols(gpt_data$gpt590_10lab, 8)
claude10mat <- as_label_cols(claude_data$claude590_10lab, 8)
gpt4o10mat <- as_label_cols(gpt_data$gpt4o_10lab, 8)

gpt10mat_one <- as_label_cols(gpt_data$gpt_10_oneshot, 8)
claude10mat_one <- as_label_cols(claude_data$claude_10_oneshot, 8)
gpt4o10mat_one <- as_label_cols(gpt_data$gpt4o_10lab_oneshot, 8)

gpt10mat_few <- as_label_cols(gpt_data$gpt_10_fewshot, 8)
claude10mat_few <- as_label_cols(claude_data$claude_10_fewshot, 8)
gpt4o10mat_few <- as_label_cols(gpt_data$gpt4o_10lab_fewshot, 8)

# Human label columns already carry the Label names
hum10mat <- human_data$human_10lab %>% 
  select(Label1:Label8) 

hum4mat <- human_data$human_4lab %>% 
  select(Label1:Label4) 

# 4-label specification: zero shot, one shot, few shot
gpt4mat <- as_label_cols(gpt_data$gpt590_4lab, 4)
claude4mat <- as_label_cols(claude_data$claude590_4lab, 4)
gpt4o4mat <- as_label_cols(gpt_data$gpt4o_4lab, 4)

gpt4mat_one <- as_label_cols(gpt_data$gpt_4_oneshot, 4)
claude4mat_one <- as_label_cols(claude_data$claude_4_oneshot, 4)
gpt4o4mat_one <- as_label_cols(gpt_data$gpt4o_4lab_oneshot, 4)

gpt4mat_few <- as_label_cols(gpt_data$gpt_4_fewshot, 4)
claude4mat_few <- as_label_cols(claude_data$claude_4_fewshot, 4)
gpt4o4mat_few <- as_label_cols(gpt_data$gpt4o_4lab_fewshot, 4)


#Convert to matrices
claude10matx <- as.matrix(claude10mat)
gpt10matx <- as.matrix(gpt10mat)
gpt4o10matx <- as.matrix(gpt4o10mat)
gpt10mat_onex <- as.matrix(gpt10mat_one)
claude10mat_onex <- as.matrix(claude10mat_one)
gpt4o10mat_onex <- as.matrix(gpt4o10mat_one)
gpt10mat_fewx <- as.matrix(gpt10mat_few)
claude10mat_fewx <- as.matrix(claude10mat_few)
gpt4o10mat_fewx <- as.matrix(gpt4o10mat_few)

claude4matx <- as.matrix(claude4mat)
gpt4matx <- as.matrix(gpt4mat)
gpt4o4matx <- as.matrix(gpt4o4mat)
gpt4mat_onex <- as.matrix(gpt4mat_one)
claude4mat_onex <- as.matrix(claude4mat_one)
gpt4o4mat_onex <- as.matrix(gpt4o4mat_one)
gpt4mat_fewx <- as.matrix(gpt4mat_few)
claude4mat_fewx <- as.matrix(claude4mat_few)
gpt4o4mat_fewx <- as.matrix(gpt4o4mat_few)

hum10matx <- as.matrix(hum10mat)
hum4matx <- as.matrix(hum4mat)

# Series names shared by the colour and shape scales, in legend order.
series_names <- c(
  "Claude Sonnet 3.5 (Zero Shot)",
  "Claude Sonnet 3.5 (One Shot)",
  "Claude Sonnet 3.5 (Few Shot)",
  "GPT 4o (Zero Shot)",
  "GPT 4o (One Shot)",
  "GPT 4o (Few Shot)",
  "GPT 4 Turbo (Zero Shot)",
  "GPT 4 Turbo (One Shot)",
  "GPT 4 Turbo (Few Shot)",
  "Undergraduate Coders"
)

#Use this for black and white
models <- setNames(
  c("#A0A0A0", "#606060", "black",
    "#A0A0A0", "#606060", "black",
    "#A0A0A0", "#606060", "gray20",
    "black"),
  series_names
)

#Use this for color
models2 <- setNames(
  c("#FF9999", "#FF6666", "#CC0000",
    "#D8BFD8", "#9370DB", "#4B0082",
    "#CCFFCC", "#66CC66", "darkgreen",
    "black"),
  series_names
)

# Point shapes: one shape per model family, a separate one for the coders
shapes <- setNames(
  c(16, 16, 16,
    15, 15, 15,
    17, 17, 17,
    8),
  series_names
)
```


# Means

```{r descriptives}
# Column means of a label table, tagged with specification and model.
#
# data:   matrix or data frame with one column per label.
# labels: value stored in the `labels` column (the specification, 10 or 4).
# model:  value stored in the `model` column.
# Returns a tibble with one row per column of `data`: `mean_prop_1s`, `names`
# (the column name with any "label_" removed), `labels` and `model`.
# NAs are ignored in the means.
get_means <- function(data, labels, model) {
  col_means <- apply(data, MARGIN = 2, FUN = function(col) mean(col, na.rm = TRUE))
  tibble(
    mean_prop_1s = col_means,
    names = gsub("label_", "", names(col_means)),
    labels = labels,
    model = model
  )
}

# Per-label means for every coder / model, 8-label specification
# (stored under labels = 10)
pi10_values <- get_means(pi10, 10, "Author")
hum10_values <- get_means(hum10mat, 10, "Undergraduate Coders")
gpt10_values <- get_means(gpt10mat, 10, "GPT 4 Turbo (Zero Shot)")
claude10_values <- get_means(claude10mat, 10, "Claude Sonnet 3.5 (Zero Shot)")
gpt4o10_values <- get_means(gpt4o10mat, 10, "GPT 4o (Zero Shot)")
gpt10_one_values <- get_means(gpt10mat_one, 10, "GPT 4 Turbo (One Shot)")
claude10_one_values <- get_means(claude10mat_one, 10, "Claude Sonnet 3.5 (One Shot)")
gpt4o10_one_values <- get_means(gpt4o10mat_one, 10, "GPT 4o (One Shot)")
gpt10_few_values <- get_means(gpt10mat_few, 10, "GPT 4 Turbo (Few Shot)")
claude10_few_values <- get_means(claude10mat_few, 10, "Claude Sonnet 3.5 (Few Shot)")
gpt4o10_few_values <- get_means(gpt4o10mat_few, 10, "GPT 4o (Few Shot)")

# Same for the 4-label specification
pi4_values <- get_means(pi4, 4, "Author")
hum4_values <- get_means(hum4mat, 4, "Undergraduate Coders")
gpt4_values <- get_means(gpt4mat, 4, "GPT 4 Turbo (Zero Shot)")
claude4_values <- get_means(claude4mat, 4, "Claude Sonnet 3.5 (Zero Shot)")
gpt4o4_values <- get_means(gpt4o4mat, 4, "GPT 4o (Zero Shot)")
gpt4_one_values <- get_means(gpt4mat_one, 4, "GPT 4 Turbo (One Shot)")
claude4_one_values <- get_means(claude4mat_one, 4, "Claude Sonnet 3.5 (One Shot)")
gpt4o4_one_values <- get_means(gpt4o4mat_one, 4, "GPT 4o (One Shot)")
gpt4_few_values <- get_means(gpt4mat_few, 4, "GPT 4 Turbo (Few Shot)")
claude4_few_values <- get_means(claude4mat_few, 4, "Claude Sonnet 3.5 (Few Shot)")
gpt4o4_few_values <- get_means(gpt4o4mat_few, 4, "GPT 4o (Few Shot)")

# Stack everything into one table. The row order is kept as is because the
# label ordering in the plots is derived from order of first appearance.
all_values <- bind_rows(list(
  hum10_values, gpt10_values, claude10_values, gpt10_few_values, claude10_few_values,
  hum4_values, gpt4_values, claude4_values, gpt4_few_values, claude4_few_values,
  gpt4o10_values, gpt4o10_few_values, gpt4o4_values, gpt4o4_few_values,
  gpt4_one_values, claude4_one_values, gpt4o4_one_values, gpt10_one_values,
  claude10_one_values, gpt4o10_one_values, pi10_values, pi4_values
))

# Average across labels for each model and specification (printed)
summarize(group_by(all_values, model, labels), mean = mean(mean_prop_1s))

# Split by specification
all_values10 <- filter(all_values, labels == 10)
all_values4 <- filter(all_values, labels == 4)
```


```{r visualize descriptives}
# Display order of the models. "Author" is not among the levels, so the
# author rows get an NA `model` once the factor is applied.
model_levels <- c("Undergraduate Coders", "Claude Sonnet 3.5 (Zero Shot)", 
                  "Claude Sonnet 3.5 (One Shot)", 
                  "Claude Sonnet 3.5 (Few Shot)", "GPT 4 Turbo (Zero Shot)", 
                  "GPT 4 Turbo (One Shot)", "GPT 4 Turbo (Few Shot)", 
                  "GPT 4o (Zero Shot)", "GPT 4o (One Shot)", "GPT 4o (Few Shot)")

# Display names of the labels, in order of first appearance in the data.
label_names10 <- c("Vulnerable", "Refugee", "Disease Threat", 
                   "Econ Threat", "Instability", "Crime Threat",  
                   "Econ Benefit", "P & I")
label_names4 <- c("Human", "Threat", "Econ Benefit", "P & I")

# 8-label specification: author's mean for each label
author_values10 <- all_values10 %>%
  filter(model == "Author") %>%
  select(labels, names, author_value = mean_prop_1s)

# Merge the author values back in, take each model's difference from the
# author, and relabel the labels (levels reversed)
all_values10 <- all_values10 %>%
  left_join(author_values10, by = c("names", "labels")) %>%
  mutate(diff_from_author = mean_prop_1s - author_value) %>%
  mutate(names = factor(names, levels = rev(unique(names)), labels = rev(label_names10)))

# Ensure the model ordering is correct
all_values10$model <- factor(all_values10$model, levels = model_levels)

# 4-label specification: same steps
author_values4 <- all_values4 %>%
  filter(model == "Author") %>%
  select(labels, names, author_value = mean_prop_1s)

all_values4 <- all_values4 %>%
  left_join(author_values4, by = c("names", "labels")) %>%
  mutate(diff_from_author = mean_prop_1s - author_value) %>%
  mutate(names = factor(names, levels = rev(unique(names)), labels = rev(label_names4)))

# Ensure the model ordering is correct
all_values4$model <- factor(all_values4$model, levels = model_levels)

# Shared recipe for both panels: one point per (label, model) at
# `mean_prop_1s`, with a dashed reference line at 0. `shapes` and `models2`
# are the manual shape/colour scales defined earlier in the document.
# NOTE(review): the x-axis title says "%" but the axis runs from 0 to 0.5,
# i.e. it looks like a proportion -- confirm the intended wording.
plot_label_rates <- function(plot_data, plot_title) {
  ggplot(plot_data, aes(x = mean_prop_1s, y = names, color = model, shape = model)) +
    geom_point(size = 3) +
    theme(
      axis.text.x = element_text(size = 12),
      legend.title = element_text(size = 14),
      axis.title.x = element_text(size = 15, margin = margin(t = 5)),
      legend.text = element_text(size = 12),
      axis.text.y = element_text(size = 12)
    ) +
    labs(
      title = plot_title,
      x = "% of Articles Classified as False Positives",
      y = "",
      color = "Model:",
      shape = "Model:"
    ) +
    scale_shape_manual(values = shapes) +
    geom_vline(xintercept = 0, linetype = "dashed") +
    scale_color_manual(values = models2) +
    scale_x_continuous(
      limits = c(-0.01, 0.5),
      breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5)
    )
}

# 8-label panel
# NOTE(review): this panel filters on "Author" while the 4-label panel below
# filters on "Undergraduate Coders" -- confirm the asymmetry is intended.
av10 <- plot_label_rates(
  filter(all_values10, model != "Author"),
  "8 Label Specification"
)

av10

# 4-label panel
av4 <- plot_label_rates(
  filter(all_values4, model != "Undergraduate Coders"),
  "4 Label Specification"
)

av4

# Place the panels side by side; the guides() call is added to the second
# panel and the remaining legend is collected on the right.
cav <- (
  av10 +
    av4 +
    guides(color = "none", shape = "none")
) +
  plot_layout(ncol = 2, guides = "collect", axes = "collect") &
  theme(legend.position = "right")

print(cav)

#ggsave("diff_fig3.png", cav, width = 11, height = 6)
```

# Regressions to determine what predicts correctly classified labels

```{r, eval = F}

#Since this chunk requires the raw text, I separately save the datasets without the text data and re-load them in for replication

# Per-document undergraduate agreement flags for each specification
hum10agree <- read_csv("human10_agree.csv")
hum4agree <- read_csv("human4_agree.csv")

# 8-label gold-standard file: drop the index column and docid 0, add word
# counts (whitespace-delimited tokens, raw and per 100 words), and attach the
# undergraduate agreement flag by docid.
human_10lab <- read_csv("AI_goldstandard_ten_v2.csv") %>%
  select(-c(...1)) %>%
  filter(docid != 0) %>%
  mutate(
    word_count = str_count(text, "\\S+"),
    word_count_per100 = word_count / 100
  ) %>%
  left_join(select(hum10agree, docid, code_agreement), by = "docid") %>%
  rename(human_code_agreement = code_agreement)

# Columns 4 to 11 are renamed by position to Theme1 ... Theme8
names(human_10lab)[4:11] <- paste0("Theme", 1:8)

# Keep a copy of every Theme* column under a "_undergrad" suffix
human_10lab <- mutate(
  human_10lab,
  across(starts_with("Theme"), ~ .x, .names = "{.col}_undergrad")
)

# 4-label gold-standard file: drop the index column and docid 0, add word
# counts (whitespace-delimited tokens, raw and per 100 words), and attach the
# undergraduate agreement flag by docid.
human_4lab <- read_csv("AI_goldstandard_four_v2.csv") %>%
  select(-c(...1)) %>%
  filter(docid != 0) %>%
  mutate(
    word_count = str_count(text, "\\S+"),
    word_count_per100 = word_count / 100
  ) %>%
  left_join(select(hum4agree, docid, code_agreement), by = "docid") %>%
  rename(human_code_agreement = code_agreement)

# Columns 3 to 6 are renamed by position to Theme1 ... Theme4
names(human_4lab)[3:6] <- paste0("Theme", 1:4)

# Keep a copy of every Theme* column under a "_undergrad" suffix
human_4lab <- mutate(
  human_4lab,
  across(starts_with("Theme"), ~ .x, .names = "{.col}_undergrad")
)

# Load the GPT-4o few-shot codings (8-label specification) and attach, by
# docid, the article text, length, undergraduate agreement flag and the
# undergraduate theme codes (Theme*_undergrad) from human_10lab.
gpt4o_10_fewshot <- read_csv("5-24-24gpt10lab_4o_fewshot.csv") %>%
  select(-c(...1)) %>%
  filter(docid != 0) %>%
  left_join(
    human_10lab %>%
      dplyr::select(text, docid, word_count_per100, human_code_agreement,
                    ends_with("grad")),
    by = "docid"
  ) %>%
  # 1 = undergraduates agreed, 0 = any other non-missing value
  mutate(human_code_agreement = if_else(human_code_agreement == "Agree", 1, 0)) %>%
  # alignment = 1 when the AI and the undergraduates gave the same code on
  # every theme. The comparison uses the Theme*_undergrad columns joined by
  # docid above (copies of human_10lab's Theme* columns) instead of indexing
  # human_10lab by row position, so it no longer depends on both files
  # having the same row order and row count.
  mutate(alignment = if_else(Theme1 == Theme1_undergrad &
                               Theme2 == Theme2_undergrad &
                               Theme3 == Theme3_undergrad &
                               Theme4 == Theme4_undergrad &
                               Theme5 == Theme5_undergrad &
                               Theme6 == Theme6_undergrad &
                               Theme7 == Theme7_undergrad &
                               Theme8 == Theme8_undergrad, 1, 0)) %>%
  # DVs: Theme*_binary is 1 when the AI coded the theme as 1, otherwise 0
  mutate(across(all_of(paste0("Theme", 1:8)),
                ~ if_else(.x == 1, 1, 0),
                .names = "{.col}_binary"))

# Load the GPT-4o few-shot codings (4-label specification) and attach, by
# docid, the article text, length, undergraduate agreement flag and the
# undergraduate theme codes (Theme*_undergrad) from human_4lab.
gpt4o_4_fewshot <- read_csv("5-24-24gpt4lab_4o_fewshot.csv") %>%
  select(-c(...1)) %>%
  filter(docid != 0) %>%
  left_join(
    human_4lab %>%
      dplyr::select(text, docid, word_count_per100, human_code_agreement,
                    ends_with("grad")),
    by = "docid"
  ) %>%
  # 1 = undergraduates agreed, 0 = any other non-missing value
  mutate(human_code_agreement = if_else(human_code_agreement == "Agree", 1, 0)) %>%
  # alignment = 1 when the AI and the undergraduates gave the same code on
  # every theme. The comparison uses the Theme*_undergrad columns joined by
  # docid above (copies of human_4lab's Theme* columns) instead of indexing
  # human_4lab by row position, so it no longer depends on both files having
  # the same row order and row count.
  mutate(alignment = if_else(Theme1 == Theme1_undergrad &
                               Theme2 == Theme2_undergrad &
                               Theme3 == Theme3_undergrad &
                               Theme4 == Theme4_undergrad, 1, 0)) %>%
  # DVs: Theme*_binary is 1 when the AI coded the theme as 1, otherwise 0
  mutate(across(all_of(paste0("Theme", 1:4)),
                ~ if_else(.x == 1, 1, 0),
                .names = "{.col}_binary"))

# The agreement tables have been merged in above and are no longer needed
rm(hum10agree, hum4agree)

#Variable for mislabeled articles

#8 label... Loop through each theme variable and create the comparison column
# "TRUE" in this var means that the AI labeled the text differently than the humans
#theme_vars <- paste0("Theme", 1:8)

#for (theme_var in theme_vars) {
#  agree_col <- paste0("undergradAI_t", substring(theme_var, 6))
#  gpt4o_10_fewshot <- gpt4o_10_fewshot %>%
#    mutate(!!agree_col := .data[[theme_var]] == human_10lab[[theme_var]])
#}

#Now 4 label
#theme_vars4 <- paste0("Theme", 1:4)

#for (theme_var in theme_vars4) {
#  agree_col4 <- paste0("undergradAI_t", substring(theme_var, 6))
#  gpt4o_4_fewshot <- gpt4o_4_fewshot %>%
#    mutate(!!agree_col4 := .data[[theme_var]] == human_4lab[[theme_var]])
#}

# Text complexity: two Flesch-type readability scores per article
texts <- human_10lab$text
n_texts <- length(texts)

# Pre-allocated result vectors, one slot per article
readability_scores <- numeric(n_texts)  # Fernandez-Huerta scores
readability_scores2 <- numeric(n_texts) # Szigriszt scores

# Score the articles one at a time (takes a couple of minutes). Each text is
# tokenized once and the same token object is scored under both parameter
# sets ("es" and "es-s").
for (idx in seq_len(n_texts)) {
  tokens_es <- tokenize(texts[idx], lang = "es", format = "obj")
  readability_scores[idx] <- flesch(tokens_es, parameters = "es")@Flesch$RE
  readability_scores2[idx] <- flesch(tokens_es, parameters = "es-s")@Flesch$RE
}

# One row per document, carrying both scores
texts_dfm <- data.frame(
  docid = human_10lab$docid,
  flesch_score = readability_scores,
  flesch_score2 = readability_scores2
)

# 60.6: right at the border between a US 7th-8th grade reading level and a
# 9th-10th grade reading level on average
mean(texts_dfm$flesch_score)

# 56.5: right in the middle of US 7th-8th grade reading level
mean(texts_dfm$flesch_score2)

# Author (PI) codings: lower-case the column names, then rename columns 2 to 9
# by position to Theme1 ... Theme8.
pi <- read_csv("pi_codings_2ndhalf.csv")
pi <- rename_with(pi, tolower)
names(pi)[2:9] <- paste0("Theme", 1:8)

# Missing theme codes count as 0; keep the first row for each docid
pi <- pi %>%
  mutate(across(Theme1:Theme8, ~ replace_na(.x, 0))) %>%
  distinct(docid, .keep_all = TRUE)

# 8-label author codings: the eight themes (suffixed "_author") plus docid
pi10 <- pi %>%
  dplyr::select(Theme1:Theme8, docid) %>%
  rename_with(~ paste0(.x, "_author"), .cols = Theme1:Theme8)

# Collapse the 8 labels into the 4 broader categories:
#   Theme1_author (human)   = Theme1 or Theme2
#   Theme2_author (threat)  = Theme3, Theme4, Theme5 or Theme6
#   Theme3_author (benefit) = Theme7
#   Theme4_author (P & I)   = Theme8
# The original Theme1:Theme8 columns and the `...10` column are then dropped.
pi4 <- pi %>%
  mutate(
    hum = if_else(Theme1 == 1 | Theme2 == 1, 1, 0),
    threat = if_else(Theme3 == 1 | Theme4 == 1 | Theme5 == 1 | Theme6 == 1, 1, 0),
    ben = if_else(Theme7 == 1, 1, 0),
    pi = Theme8
  ) %>%
  select(-c(Theme1:Theme8, ...10)) %>%
  rename(
    Theme1_author = hum,
    Theme2_author = threat,
    Theme3_author = ben,
    Theme4_author = pi
  )

# Append pi_alignment_theme1 ... pi_alignment_theme<n>: 1 when the AI code
# (ThemeK) equals the author code (ThemeK_author), otherwise 0.
add_pi_alignment <- function(coded, n_themes) {
  for (k in seq_len(n_themes)) {
    ai_col <- paste0("Theme", k)
    author_col <- paste0(ai_col, "_author")
    coded[[paste0("pi_alignment_theme", k)]] <-
      if_else(coded[[ai_col]] == coded[[author_col]], 1, 0)
  }
  coded
}

# 8-label data: add readability scores and author codings, then alignment
gpt4o_10_fewshot <- gpt4o_10_fewshot %>%
  left_join(texts_dfm, by = "docid") %>%
  left_join(pi10, by = "docid") %>%
  add_pi_alignment(8)

# 4-label data: same steps with the collapsed author codings
gpt4o_4_fewshot <- gpt4o_4_fewshot %>%
  left_join(texts_dfm, by = "docid") %>%
  left_join(pi4, by = "docid") %>%
  add_pi_alignment(4)

# Replication copies: each dataset is written out without the `text` column
gpt4o_10_fewshot %>%
  dplyr::select(-c(text)) %>%
  write.csv("gpt4o_10_fewshot_notext.csv")

gpt4o_4_fewshot %>%
  dplyr::select(-c(text)) %>%
  write.csv("gpt4o_4_fewshot_notext.csv")

human_10lab %>%
  dplyr::select(-c(text)) %>%
  write.csv("human10lab_notext.csv")

human_4lab %>%
  dplyr::select(-c(text)) %>%
  write.csv("human4lab_notext.csv")

```

## Models with labeling as 1s as benchmark

```{r 10 label models}
# Start from an empty workspace so that only the saved, text-free files feed
# the models below.
rm(list = ls())

# AI codings (GPT-4o, few shot) with covariates, 8- and 4-label versions
gpt4o_10_fewshot <- read_csv("gpt4o_10_fewshot_notext.csv")
gpt4o_4_fewshot <- read_csv("gpt4o_4_fewshot_notext.csv")

# Human codings, 8- and 4-label versions
human10lab <- read_csv("human10lab_notext.csv")
human4lab <- read_csv("human4lab_notext.csv")

# Linear models, one per theme of the 8-label specification. Outcome: the
# AI's 0/1 code for the theme (Theme*_binary). Predictors: article length
# (per 100 words), readability (flesch_score) and undergraduate agreement.
# Inspect any single fit with summary(), e.g. summary(t1_10).
t1_10 <- lm(
  Theme1_binary ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

t2_10 <- lm(
  Theme2_binary ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

t3_10 <- lm(
  Theme3_binary ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

t4_10 <- lm(
  Theme4_binary ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

t5_10 <- lm(
  Theme5_binary ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

t6_10 <- lm(
  Theme6_binary ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

t7_10 <- lm(
  Theme7_binary ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

t8_10 <- lm(
  Theme8_binary ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

#summary(t8_10)

#Run regressions with alt spec for Szigriszt-Pazos index
#t1_10 = lm(Theme1_binary ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t1_10)

#t2_10 = lm(Theme2_binary ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t2_10)

#t3_10 = lm(Theme3_binary ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t3_10)

#t4_10 = lm(Theme4_binary ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t4_10)

#t5_10 = lm(Theme5_binary ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t5_10)

#t6_10 = lm(Theme6_binary ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t6_10)

#t7_10 = lm(Theme7_binary ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t7_10)

#t8_10 = lm(Theme8_binary ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t8_10)

# Regression table for the eight theme models; F statistic and residual
# standard error are omitted.
stargazer(
  t1_10, t2_10, t3_10, t4_10, t5_10, t6_10, t7_10, t8_10,
  digits = 2,
  covariate.labels = c(
    "Word Count (By 100)",
    "Text Readability",
    "Undergrad Agreement",
    "Intercept"
  ),
  dep.var.caption = "",
  dep.var.labels = c(
    "Vulnerable", "Refugee", "Disease Threat", "Econ Threat",
    "Instability", "Crime Threat", "Econ Benefit", "P & I"
  ),
  omit.stat = c("f", "ser")
)
```


```{r 4 label models}
# Linear models, one per theme of the 4-label specification. Outcome: the
# AI's 0/1 code for the theme (Theme*_binary). Predictors: article length
# (per 100 words), readability (flesch_score) and undergraduate agreement.
# Inspect any single fit with summary(), e.g. summary(t1_4).
t1_4 <- lm(
  Theme1_binary ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_4_fewshot
)

t2_4 <- lm(
  Theme2_binary ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_4_fewshot
)

t3_4 <- lm(
  Theme3_binary ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_4_fewshot
)

t4_4 <- lm(
  Theme4_binary ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_4_fewshot
)

#summary(t4_4)

#Run regressions with alt spec for Szigriszt-Pazos index
#t1_4 = lm(Theme1_binary ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#          data = gpt4o_4_fewshot)

#summary(t1_4)

#t2_4 = lm(Theme2_binary ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#          data = gpt4o_4_fewshot)

#summary(t2_4)

#t3_4 = lm(Theme3_binary ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#          data = gpt4o_4_fewshot)

#summary(t3_4)

#t4_4 = lm(Theme4_binary ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#          data = gpt4o_4_fewshot)

#summary(t4_4)

# Regression table for the four theme models; F statistic and residual
# standard error are omitted.
stargazer(
  t1_4, t2_4, t3_4, t4_4,
  digits = 2,
  covariate.labels = c(
    "Word Count (By 100)",
    "Text Readability",
    "Undergrad Agreement",
    "Intercept"
  ),
  dep.var.caption = "",
  dep.var.labels = c("Human", "Threat", "Econ Benefit", "P & I"),
  omit.stat = c("f", "ser")
)
```

## Models with PI as benchmark

```{r 10 label models PI benchmark}
# Linear models, one per theme of the 8-label specification. Outcome:
# pi_alignment_theme*, i.e. 1 when the AI code matches the author (PI) code.
# Predictors: article length (per 100 words), readability (flesch_score) and
# undergraduate agreement. These assignments reuse the names t1_10 ... t8_10.
# Inspect any single fit with summary(), e.g. summary(t1_10).
t1_10 <- lm(
  pi_alignment_theme1 ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

t2_10 <- lm(
  pi_alignment_theme2 ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

t3_10 <- lm(
  pi_alignment_theme3 ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

t4_10 <- lm(
  pi_alignment_theme4 ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

t5_10 <- lm(
  pi_alignment_theme5 ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

t6_10 <- lm(
  pi_alignment_theme6 ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

t7_10 <- lm(
  pi_alignment_theme7 ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

t8_10 <- lm(
  pi_alignment_theme8 ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_10_fewshot
)

#summary(t8_10)

#Run regressions with alt spec for Szigriszt-Pazos index
#t1_10 = lm(pi_alignment_theme1 ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t1_10)

#t2_10 = lm(pi_alignment_theme2 ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t2_10)

#t3_10 = lm(pi_alignment_theme3 ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t3_10)

#t4_10 = lm(pi_alignment_theme4 ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t4_10)

#t5_10 = lm(pi_alignment_theme5 ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t5_10)

#t6_10 = lm(pi_alignment_theme6 ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t6_10)

#t7_10 = lm(pi_alignment_theme7 ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t7_10)

#t8_10 = lm(pi_alignment_theme8 ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#           data = gpt4o_10_fewshot)

#summary(t8_10)

# Regression table for the eight PI-alignment models; F statistic and
# residual standard error are omitted.
stargazer(
  t1_10, t2_10, t3_10, t4_10, t5_10, t6_10, t7_10, t8_10,
  digits = 2,
  covariate.labels = c(
    "Word Count (By 100)",
    "Text Readability",
    "Undergrad Agreement",
    "Intercept"
  ),
  dep.var.caption = "",
  dep.var.labels = c(
    "Vulnerable", "Refugee", "Disease Threat", "Econ Threat",
    "Instability", "Crime Threat", "Econ Benefit", "P & I"
  ),
  omit.stat = c("f", "ser")
)
```


```{r 4 label models PI benchmark}
# Linear models, one per theme of the 4-label specification. Outcome:
# pi_alignment_theme*, i.e. 1 when the AI code matches the author (PI) code.
# Predictors: article length (per 100 words), readability (flesch_score) and
# undergraduate agreement. These assignments reuse the names t1_4 ... t4_4.
# Inspect any single fit with summary(), e.g. summary(t1_4).
t1_4 <- lm(
  pi_alignment_theme1 ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_4_fewshot
)

t2_4 <- lm(
  pi_alignment_theme2 ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_4_fewshot
)

t3_4 <- lm(
  pi_alignment_theme3 ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_4_fewshot
)

t4_4 <- lm(
  pi_alignment_theme4 ~ word_count_per100 + flesch_score + human_code_agreement,
  data = gpt4o_4_fewshot
)

#Run regressions with alt spec for Szigriszt-Pazos index
#t1_4 = lm(pi_alignment_theme1 ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#          data = gpt4o_4_fewshot)

#summary(t1_4)

#t2_4 = lm(pi_alignment_theme2 ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#          data = gpt4o_4_fewshot)

#summary(t2_4)

#t3_4 = lm(pi_alignment_theme3 ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#          data = gpt4o_4_fewshot)

#summary(t3_4)

#t4_4 = lm(pi_alignment_theme4 ~ word_count_per100 + flesch_score2 + human_code_agreement, 
#          data = gpt4o_4_fewshot)

#summary(t4_4)

# Regression table for the four PI-alignment models; F statistic and
# residual standard error are omitted.
stargazer(
  t1_4, t2_4, t3_4, t4_4,
  digits = 2,
  covariate.labels = c(
    "Word Count (By 100)",
    "Text Readability",
    "Undergrad Agreement",
    "Intercept"
  ),
  dep.var.caption = "",
  dep.var.labels = c("Human", "Threat", "Econ Benefit", "P & I"),
  omit.stat = c("f", "ser")
)
```

## Create tables for false positives and negatives

```{r}
# Confusion-matrix counts against the author (PI) codings, pooled over themes.
#
# data        Data frame holding, for every theme k in `theme_ids`, a
#             prediction column "Theme<k><pred_suffix>" and the author column
#             "Theme<k>_author".
# theme_ids   Theme numbers to pool over (e.g. 1:4 or 1:8).
# pred_suffix Suffix of the prediction columns: "" for the AI codes,
#             "_undergrad" for the undergraduate codes.
#
# Returns a one-row tibble with columns TP, FP, FN and TN. As with a plain
# sum(), a missing value in any compared column makes the affected count NA.
confusion_counts <- function(data, theme_ids, pred_suffix = "") {
  pred_cols <- paste0("Theme", theme_ids, pred_suffix)
  author_cols <- paste0("Theme", theme_ids, "_author")

  # Fail loudly on a missing column rather than silently counting zero
  missing_cols <- setdiff(c(pred_cols, author_cols), names(data))
  if (length(missing_cols) > 0) {
    stop("Missing columns: ", paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  # Number of (document, theme) pairs with the given predicted/author codes
  tally <- function(pred_value, author_value) {
    per_theme <- vapply(seq_along(theme_ids), function(j) {
      sum(data[[pred_cols[j]]] == pred_value &
            data[[author_cols[j]]] == author_value)
    }, integer(1))
    sum(per_theme)
  }

  tibble(
    TP = tally(1, 1),
    FP = tally(1, 0),
    FN = tally(0, 1),
    TN = tally(0, 0)
  )
}

# 4-label dataset: AI predictions, then undergraduate predictions
aggregate_ai_4lab <- confusion_counts(gpt4o_4_fewshot, 1:4)
aggregate_undergrad_4lab <- confusion_counts(gpt4o_4_fewshot, 1:4, "_undergrad")

# 10-label dataset (8 themes): AI predictions, then undergraduate predictions
aggregate_ai_10lab <- confusion_counts(gpt4o_10_fewshot, 1:8)
aggregate_undergrad_10lab <- confusion_counts(gpt4o_10_fewshot, 1:8, "_undergrad")
```

# Analyze prevalence of 1s and 0s in random versus fixed order prompts

```{r read back in}
# Read one file of AI labels and tag it with its metadata.
#
# file       Path of the CSV to read.
# date       Value stored in the `type` column (callers pass "OG" or "RAND").
# coder      Value stored in the `coder` column.
# num_labels Value stored in the `num_labels` column.
# drop_ids   docids to exclude (default: 0 and 451).
#
# Returns the data with the three metadata columns added, the `...1` column
# removed and the rows whose docid is in `drop_ids` dropped.
read_and_process <- function(file, date, coder, num_labels, drop_ids = c(0, 451)) {
  labelled <- read_csv(file)
  labelled <- mutate(labelled, type = date, coder = coder, num_labels = num_labels)
  labelled <- select(labelled, -...1)
  filter(labelled, !docid %in% drop_ids)
}

# OG datasets
# Each entry is c(file, type, coder, num_labels), passed positionally to
# read_and_process() below; the entry name becomes the name of the loaded
# dataset in og_data.
ogs <- list(
  gpt_10lab = c("5-24-24gpt10lab.csv", "OG", "gpt-4-turbo", "10"),
  gpt_4lab = c("5-24-24gpt4lab.csv", "OG", "gpt-4-turbo", "4"),
  gpt_10_oneshot = c("5-24-24gpt10lab_oneshot.csv", "OG", "gpt-4-turbo", "10"),
  gpt_4_oneshot = c("5-24-24gpt4lab_oneshot.csv", "OG", "gpt-4-turbo", "4"), 
  gpt_10_fewshot = c("5-24-24gpt10lab_fewshot.csv", "OG", "gpt-4-turbo", "10"),
  gpt_4_fewshot = c("5-24-24gpt4lab_fewshot.csv", "OG", "gpt-4-turbo", "4"), 
  # NOTE(review): the next two entries point at the same files as gpt_10lab
  # and gpt_4lab above (no "_4o" in the name, unlike the other gpt-4o
  # entries) -- confirm these are really the GPT-4o zero-shot files.
  gpt4o_10lab = c("5-24-24gpt10lab.csv", "OG", "gpt-4o", "10"),
  gpt4o_4lab = c("5-24-24gpt4lab.csv", "OG", "gpt-4o", "4"),
  gpt4o_10lab_oneshot = c("5-24-24gpt10lab_4o_oneshot.csv", "OG", "gpt-4o", "10"),
  gpt4o_4lab_oneshot = c("5-24-24gpt4lab_4o_oneshot.csv", "OG", "gpt-4o", "4"),
  gpt4o_10lab_fewshot = c("5-24-24gpt10lab_4o_fewshot.csv", "OG", "gpt-4o", "10"),
  gpt4o_4lab_fewshot = c("5-24-24gpt4lab_4o_fewshot.csv", "OG", "gpt-4o", "4"), 
  claude_10lab = c("8-8-24claude3.510labn590_zeroshot.csv", "OG", "claude-sonnet-3.5", "10"),
  claude_4lab = c("8-9-24claude3.54labn590_zeroshot.csv", "OG", "claude-sonnet-3.5", "4"),
  claude_10_oneshot = c("8-8-24claude3.510labn590_oneshot.csv", "OG",
                        "claude-sonnet-3.5", "10"),
  claude_4_oneshot = c("8-8-24claude3.54labn590_oneshot.csv", "OG", 
                       "claude-sonnet-3.5", "4"),
  claude_10_fewshot = c("6-24-24claude3.510labn590_fewshot.csv", "OG",
                        "claude-sonnet-3.5", "10"),
  claude_4_fewshot = c("6-24-24claude3.54labn590_fewshot.csv", "OG", 
                       "claude-sonnet-3.5", "4")
)

# Random datasets
# Same layout and entry names as `ogs`: c(file, type, coder, num_labels),
# with type "RAND"; the entry name becomes the name of the loaded dataset in
# rand_data.
rands <- list(
  gpt_10lab = c("5-31-24gpt10zero_4turbo_rand.csv", "RAND", "gpt-4-turbo", "10"),
  gpt_4lab = c("5-31-24gpt4zero_4turbo_rand.csv", "RAND", "gpt-4-turbo", "4"),
  gpt_10_oneshot = c("5-31-24gpt10one_4turbo_rand.csv", "RAND", "gpt-4-turbo", "10"),
  gpt_4_oneshot = c("5-31-24gpt4one_4turbo_rand.csv", "RAND", "gpt-4-turbo", "4"), 
  gpt_10_fewshot = c("5-31-24gpt10few_4turbo_rand.csv", "RAND", "gpt-4-turbo", "10"),
  gpt_4_fewshot = c("5-31-24gpt4few_4turbo_rand.csv", "RAND", "gpt-4-turbo", "4"), 
  gpt4o_10lab = c("5-31-24gpt10zero_4o_rand.csv", "RAND", "gpt-4o", "10"),
  gpt4o_4lab = c("5-31-24gpt4zero_4o_rand.csv", "RAND", "gpt-4o", "4"),
  gpt4o_10lab_oneshot = c("5-31-24gpt10one_4o_rand.csv", "RAND", "gpt-4o", "10"),
  gpt4o_4lab_oneshot = c("5-31-24gpt4one_4o_rand.csv", "RAND", "gpt-4o", "4"),
  gpt4o_10lab_fewshot = c("5-31-24gpt10few_4o_rand.csv", "RAND", "gpt-4o", "10"),
  gpt4o_4lab_fewshot = c("5-31-24gpt4few_4o_rand.csv", "RAND", "gpt-4o", "4"), 
  claude_10lab = c("8-7-24claudesonnet3.5zero_rand10.csv", "RAND", "claude-sonnet-3.5", "10"),
  claude_4lab = c("8-9-24claudesonnet3.5zero_rand4.csv", "RAND", "claude-sonnet-3.5", "4"),
  claude_10_oneshot = c("8-7-24claudesonnet3.5one_rand10.csv", "RAND",
                        "claude-sonnet-3.5", "10"),
  claude_4_oneshot = c("8-11-24claudesonnet3.5one_rand4.csv", "RAND", 
                       "claude-sonnet-3.5", "4"),
  claude_10_fewshot = c("8-7-24claudesonnet3.5few_rand10.csv", "RAND",
                        "claude-sonnet-3.5", "10"),
  claude_4_fewshot = c("8-11-24claudesonnet3.5few_rand4.csv", "RAND", 
                       "claude-sonnet-3.5", "4")
)

# Load every random-order file; each list element keeps the name of its spec
rand_data <- setNames(
  lapply(rands, function(spec) {
    read_and_process(
      file = spec[1],
      date = spec[2],
      coder = spec[3],
      num_labels = spec[4]
    )
  }),
  names(rands)
)

# Load every OG file the same way
og_data <- setNames(
  lapply(ogs, function(spec) {
    read_and_process(
      file = spec[1],
      date = spec[2],
      coder = spec[3],
      num_labels = spec[4]
    )
  }),
  names(ogs)
)
```


```{r make into matrices}
# Column means of a set of 0/1 label columns, in long format.
#
# data   Data frame of label columns (one column per label).
# labels Value stored in the `labels` column of every output row.
# model  Value stored in the `model` column of every output row.
#
# Returns a tibble with one row per column of `data`: `mean_prop_1s` (the
# column mean, NAs ignored), `names` (the column name with any "label_"
# removed), `labels` and `model`.
get_means <- function(data, labels, model) {
  # Take the mean column by column. apply() would first coerce the whole data
  # frame to a matrix, so a single non-numeric column would turn every mean
  # into NA; here only that column is affected.
  values <- vapply(data, mean, numeric(1), na.rm = TRUE)
  tibble(
    mean_prop_1s = values,
    names = gsub("label_", "", names(values)),
    labels = labels,
    model = model
  )
}

# Keep only the columns of `data` whose names start with the prefix(es)
# given in `themes`.
extract_data <- function(data, themes) {
  select(data, starts_with(themes))
}

# OG (fixed-order) runs: Theme* columns only.
# 8-label specification ("10lab" files)
gpt10mat_og <- extract_data(og_data[["gpt_10lab"]], "Theme")
claude10mat_og <- extract_data(og_data[["claude_10lab"]], "Theme")
gpt4o10mat_og <- extract_data(og_data[["gpt4o_10lab"]], "Theme")
gpt10mat_one_og <- extract_data(og_data[["gpt_10_oneshot"]], "Theme")
claude10mat_one_og <- extract_data(og_data[["claude_10_oneshot"]], "Theme")
gpt4o10mat_one_og <- extract_data(og_data[["gpt4o_10lab_oneshot"]], "Theme")
gpt10mat_few_og <- extract_data(og_data[["gpt_10_fewshot"]], "Theme")
claude10mat_few_og <- extract_data(og_data[["claude_10_fewshot"]], "Theme")
gpt4o10mat_few_og <- extract_data(og_data[["gpt4o_10lab_fewshot"]], "Theme")

# 4-label specification
gpt4mat_og <- extract_data(og_data[["gpt_4lab"]], "Theme")
claude4mat_og <- extract_data(og_data[["claude_4lab"]], "Theme")
gpt4o4mat_og <- extract_data(og_data[["gpt4o_4lab"]], "Theme")
gpt4mat_one_og <- extract_data(og_data[["gpt_4_oneshot"]], "Theme")
claude4mat_one_og <- extract_data(og_data[["claude_4_oneshot"]], "Theme")
gpt4o4mat_one_og <- extract_data(og_data[["gpt4o_4lab_oneshot"]], "Theme")
gpt4mat_few_og <- extract_data(og_data[["gpt_4_fewshot"]], "Theme")
claude4mat_few_og <- extract_data(og_data[["claude_4_fewshot"]], "Theme")
gpt4o4mat_few_og <- extract_data(og_data[["gpt4o_4lab_fewshot"]], "Theme")

#RAND
gpt10mat <- extract_data(rand_data$gpt_10lab, "Theme")
claude10mat <- extract_data(rand_data$claude_10lab, "Theme")
gpt4o10mat <- extract_data(rand_data$gpt4o_10lab, "Theme")
gpt10mat_one <- extract_data(rand_data$gpt_10_oneshot, "Theme")
claude10mat_one <- extract_data(rand_data$claude_10_oneshot, "Theme")
gpt4o10mat_one <- extract_data(rand_data$gpt4o_10lab_oneshot, "Theme")
gpt10mat_few <- extract_data(rand_data$gpt_10_fewshot, "Theme")
claude10mat_few <- extract_data(rand_data$claude_10_fewshot, "Theme")
gpt4o10mat_few <- extract_data(rand_data$gpt4o_10lab_fewshot, "Theme")

gpt4mat <- extract_data(rand_data$gpt_4lab, "Theme")
claude4mat <- extract_data(rand_data$claude_4lab, "Theme")
gpt4o4mat <- extract_data(rand_data$gpt4o_4lab, "Theme")
gpt4mat_one <- extract_data(rand_data$gpt_4_oneshot, "Theme")
claude4mat_one <- extract_data(rand_data$claude_4_oneshot, "Theme")
gpt4o4mat_one <- extract_data(rand_data$gpt4o_4lab_oneshot, "Theme")
gpt4mat_few <- extract_data(rand_data$gpt_4_fewshot, "Theme")
claude4mat_few <- extract_data(rand_data$claude_4_fewshot, "Theme")
gpt4o4mat_few <- extract_data(rand_data$gpt4o_4lab_fewshot, "Theme")

# Process each dataset and combine results (RAND)
gpt10_values <- get_means(gpt10mat, 10, "GPT 4 Turbo")
claude10_values <- get_means(claude10mat, 10, "Claude Sonnet 3")
gpt4o10_values <- get_means(gpt4o10mat, 10, "GPT 4o")
gpt10_one_values <- get_means(gpt10mat_one, 10, "GPT 4 Turbo (One Shot)")
claude10_one_values <- get_means(claude10mat_one, 10, "Claude Sonnet 3 (One Shot)")
gpt4o10_one_values <- get_means(gpt4o10mat_one, 10, "GPT 4o (One Shot)")
gpt10_few_values <- get_means(gpt10mat_few, 10, "GPT 4 Turbo (Few Shot)")
claude10_few_values <- get_means(claude10mat_few, 10, "Claude Sonnet 3 (Few Shot)")
gpt4o10_few_values <- get_means(gpt4o10mat_few, 10, "GPT 4o (Few Shot)")

gpt4_values <- get_means(gpt4mat, 4, "GPT 4 Turbo")
claude4_values <- get_means(claude4mat, 4, "Claude Sonnet 3")
gpt4o4_values <- get_means(gpt4o4mat, 4, "GPT 4o")
gpt4_one_values <- get_means(gpt4mat_one, 4, "GPT 4 Turbo (One Shot)")
claude4_one_values <- get_means(claude4mat_one, 4, "Claude Sonnet 3 (One Shot)")
gpt4o4_one_values <- get_means(gpt4o4mat_one, 4, "GPT 4o (One Shot)")
gpt4_few_values <- get_means(gpt4mat_few, 4, "GPT 4 Turbo (Few Shot)")
claude4_few_values <- get_means(claude4mat_few, 4, "Claude Sonnet 3 (Few Shot)")
gpt4o4_few_values <- get_means(gpt4o4mat_few, 4, "GPT 4o (Few Shot)")

#OG
gpt10_values_og <- get_means(gpt10mat_og, 10, "GPT 4 Turbo")
claude10_values_og <- get_means(claude10mat_og, 10, "Claude Sonnet 3.5")
gpt4o10_values_og <- get_means(gpt4o10mat_og, 10, "GPT 4o")
gpt10_one_values_og <- get_means(gpt10mat_one_og, 10, "GPT 4 Turbo (One Shot)")
claude10_one_values_og <- get_means(claude10mat_one_og, 10, "Claude Sonnet 3.5 (One Shot)")
gpt4o10_one_values_og <- get_means(gpt4o10mat_one_og, 10, "GPT 4o (One Shot)")
gpt10_few_values_og <- get_means(gpt10mat_few_og, 10, "GPT 4 Turbo (Few Shot)")
claude10_few_values_og <- get_means(claude10mat_few_og, 10, "Claude Sonnet 3.5 (Few Shot)")
gpt4o10_few_values_og <- get_means(gpt4o10mat_few_og, 10, "GPT 4o (Few Shot)")

gpt4_values_og <- get_means(gpt4mat_og, 4, "GPT 4 Turbo")
claude4_values_og <- get_means(claude4mat_og, 4, "Claude Sonnet 3.5")
gpt4o4_values_og <- get_means(gpt4o4mat_og, 4, "GPT 4o")
gpt4_one_values_og <- get_means(gpt4mat_one_og, 4, "GPT 4 Turbo (One Shot)")
claude4_one_values_og <- get_means(claude4mat_one_og, 4, "Claude Sonnet 3.5 (One Shot)")
gpt4o4_one_values_og <- get_means(gpt4o4mat_one_og, 4, "GPT 4o (One Shot)")
gpt4_few_values_og <- get_means(gpt4mat_few_og, 4, "GPT 4 Turbo (Few Shot)")
claude4_few_values_og <- get_means(claude4mat_few_og, 4, "Claude Sonnet 3.5 (Few Shot)")
gpt4o4_few_values_og <- get_means(gpt4o4mat_few_og, 4, "GPT 4o (Few Shot)")

# Combine all the data frames into a single data frame
all_values <- bind_rows(
  gpt10_values, claude10_values, gpt10_few_values, claude10_few_values,
  gpt4_values, claude4_values, gpt4_few_values, claude4_few_values,
  gpt4o10_values, gpt4o10_few_values, gpt4o4_values, gpt4o4_few_values, 
  gpt4_one_values, claude4_one_values, gpt4o4_one_values, gpt10_one_values,
  claude10_one_values, gpt4o10_one_values,
  gpt10_values_og, claude10_values_og, gpt10_few_values_og, claude10_few_values_og,
  gpt4_values_og, claude4_values_og, gpt4_few_values_og, claude4_few_values_og,
  gpt4o10_values_og, gpt4o10_few_values_og, gpt4o4_values_og, gpt4o4_few_values_og, 
  gpt4_one_values_og, claude4_one_values_og, gpt4o4_one_values_og, gpt10_one_values_og,
  claude10_one_values_og, gpt4o10_one_values_og,
  .id = "name"
)

#Identify DF type
all_values$type = if_else(as.numeric(all_values$name) > 18, "OG", "RAND")

# Group by model and labels, then summarize the mean
all_values %>% group_by(model, labels, type) %>% 
  summarize(mean = mean(mean_prop_1s)) %>% arrange(model, labels, type)
```


```{r}
# Two-sample t-test comparing the column means of two data sets.
#
# Arguments:
#   mat1, mat2 - numeric data frames / matrices; each is reduced to its vector
#                of column means (NAs ignored) before testing.
#
# Returns the "htest" object produced by t.test(); its first estimate is the
# mean of mat1's column means, the second that of mat2's.
perform_overall_mean_ttest <- function(mat1, mat2) {
  # The local names are kept as-is because t.test() records them in the
  # `data.name` field of its result.
  mean1 <- colMeans(mat1, na.rm = TRUE)
  mean2 <- colMeans(mat2, na.rm = TRUE)
  t.test(x = mean1, y = mean2)
}

# Run a two-sample t-test for every column that appears in both data sets.
#
# Arguments:
#   mat1, mat2 - data frames; only columns whose names occur in both are tested.
#
# Returns a list of "htest" objects named by column, in the order the shared
# names appear in mat1. The list is empty when no column name is shared.
perform_columnwise_ttest <- function(mat1, mat2) {
  common_cols <- intersect(names(mat1), names(mat2))
  # t.test() has no `na.rm` argument (it was silently swallowed by `...`);
  # missing values are already dropped by t.test() itself.
  results <- lapply(common_cols, function(col) {
    t.test(mat1[[col]], mat2[[col]])
  })
  names(results) <- common_cols
  results
}

# Compare the random-order runs with the OG runs for every model / prompt setup.
#
# Arguments:
#   rand_data - named list of processed random-order data sets.
#   og_data   - named list of processed OG data sets, using the same names.
#
# Returns a list with two elements, each a list named by setup label
# ("gpt10", "claude10", ..., "gpt4o4_few"):
#   overall_tests    - perform_overall_mean_ttest(RAND matrix, OG matrix)
#   columnwise_tests - perform_columnwise_ttest(RAND matrix, OG matrix)
# In every test the random-order matrix is the first argument and the OG matrix
# the second.
extract_and_test_data <- function(rand_data, og_data) {
  # Result label -> name of the data set inside rand_data / og_data
  dataset_keys <- c(
    gpt10 = "gpt_10lab",
    claude10 = "claude_10lab",
    gpt4o10 = "gpt4o_10lab",
    gpt10_one = "gpt_10_oneshot",
    claude10_one = "claude_10_oneshot",
    gpt4o10_one = "gpt4o_10lab_oneshot",
    gpt10_few = "gpt_10_fewshot",
    claude10_few = "claude_10_fewshot",
    gpt4o10_few = "gpt4o_10lab_fewshot",
    gpt4 = "gpt_4lab",
    claude4 = "claude_4lab",
    gpt4o4 = "gpt4o_4lab",
    gpt4_one = "gpt_4_oneshot",
    claude4_one = "claude_4_oneshot",
    gpt4o4_one = "gpt4o_4lab_oneshot",
    gpt4_few = "gpt_4_fewshot",
    claude4_few = "claude_4_fewshot",
    gpt4o4_few = "gpt4o_4lab_fewshot"
  )

  # "Theme*" columns of one data set; `exact = FALSE` gives the same lookup
  # rules as the `$` operator on a list.
  theme_matrix <- function(datasets, key) {
    extract_data(datasets[[key, exact = FALSE]], "Theme")
  }

  # Apply one test function to the RAND / OG pair of every setup
  run_for_all_setups <- function(test_fun) {
    lapply(dataset_keys, function(key) {
      test_fun(theme_matrix(rand_data, key), theme_matrix(og_data, key))
    })
  }

  list(
    overall_tests = run_for_all_setups(perform_overall_mean_ttest),
    columnwise_tests = run_for_all_setups(perform_columnwise_ttest)
  )
}

# Run the overall and column-wise RAND-vs-OG t-tests for every setup
test_results <- extract_and_test_data(rand_data = rand_data, og_data = og_data)
```


```{r}
# Tabulate the group means and p-value of every overall t-test.
#
# Arguments:
#   test_results - list returned by extract_and_test_data(); only its
#                  `overall_tests` element (a named list of "htest" objects)
#                  is used.
#
# Returns a tibble with one row per test:
#   Model_Label - name of the test in `overall_tests`
#   Mean_OG     - mean of the OG group
#   Mean_RAND   - mean of the random-order group
#   P_Value     - p-value of the test
extract_means_pvalues <- function(test_results) {
  overall_tests <- test_results$overall_tests
  # extract_and_test_data() calls t.test(RAND, OG), so the first estimate
  # ("mean of x") is the RAND mean and the second ("mean of y") the OG mean.
  tibble(
    Model_Label = as.character(names(overall_tests)),
    Mean_OG = vapply(overall_tests, function(x) unname(x$estimate[2]), numeric(1)),
    Mean_RAND = vapply(overall_tests, function(x) unname(x$estimate[1]), numeric(1)),
    P_Value = vapply(overall_tests, function(x) x$p.value, numeric(1))
  )
}

# Table of overall-test means and p-values, one row per model / prompt setup
mean_pvalue_data <- extract_means_pvalues(test_results)

# Long format for ggplot2: one row per (setup, Mean_OG / Mean_RAND)
mean_pvalue_long <- pivot_longer(
  mean_pvalue_data,
  cols = c(Mean_OG, Mean_RAND),
  names_to = "Type",
  values_to = "Mean"
)

# Setups whose label contains "10" go to one panel, all others to the second
mean_pvalue_long_10 <- filter(mean_pvalue_long, str_detect(Model_Label, "10"))

mean_pvalue_long_4 <- filter(mean_pvalue_long, !str_detect(Model_Label, "10"))

# Reorder factors. Levels are written in reversed display order (Claude few-shot
# first, GPT 4 Turbo zero-shot last), matching the order the plots are built with.
mean_pvalue_long_10$Model_Label <- factor(
  mean_pvalue_long_10$Model_Label,
  levels = c(
    "claude10_few", "claude10_one", "claude10",
    "gpt4o10_few", "gpt4o10_one", "gpt4o10",
    "gpt10_few", "gpt10_one", "gpt10"
  ),
  labels = c(
    "Claude Sonnet 3.5 (Few Shot)",
    "Claude Sonnet 3.5 (One Shot)",
    "Claude Sonnet 3.5 (Zero Shot)",
    "GPT 4o (Few Shot)",
    "GPT 4o (One Shot)",
    "GPT 4o (Zero Shot)",
    "GPT 4 Turbo (Few Shot)",
    "GPT 4 Turbo (One Shot)",
    "GPT 4 Turbo (Zero Shot)"
  )
)

mean_pvalue_long_4$Model_Label <- factor(
  mean_pvalue_long_4$Model_Label,
  levels = c(
    "claude4_few", "claude4_one", "claude4",
    "gpt4o4_few", "gpt4o4_one", "gpt4o4",
    "gpt4_few", "gpt4_one", "gpt4"
  ),
  labels = c(
    "Claude Sonnet 3.5 (Few Shot)",
    "Claude Sonnet 3.5 (One Shot)",
    "Claude Sonnet 3.5 (Zero Shot)",
    "GPT 4o (Few Shot)",
    "GPT 4o (One Shot)",
    "GPT 4o (Zero Shot)",
    "GPT 4 Turbo (Few Shot)",
    "GPT 4 Turbo (One Shot)",
    "GPT 4 Turbo (Zero Shot)"
  )
)

# Author's note: none of the overall tests are significant

# Horizontal dodged bars of the mean (as a percentage) per setup, with a custom
# legend that names the two label orders
tt8 <- ggplot(
  mutate(mean_pvalue_long_10, Mean = Mean * 100),
  aes(x = Model_Label, y = Mean, fill = Type)
) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  ylim(c(0, 35)) +
  scale_fill_manual(
    values = c("Mean_OG" = "grey30", "Mean_RAND" = "#B0B0B0"),
    labels = c("Mean_OG" = "Fixed", "Mean_RAND" = "Random")
  ) +
  guides(fill = guide_legend(reverse = TRUE)) +
  labs(
    title = "8 Label Specification",
    x = NULL,
    y = "% Coded as 1s",
    fill = "Label Order"
  ) +
  # theme_minimal() has to come before theme() so the tweaks are not overwritten
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.minor = element_blank()
  )

tt8

tt4 <- ggplot(
  mutate(mean_pvalue_long_4, Mean = Mean * 100),
  aes(x = Model_Label, y = Mean, fill = Type)
) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  ylim(c(0, 35)) +
  scale_fill_manual(
    values = c("Mean_OG" = "grey30", "Mean_RAND" = "#B0B0B0"),
    labels = c("Mean_OG" = "Fixed", "Mean_RAND" = "Random")
  ) +
  guides(fill = guide_legend(reverse = TRUE)) +
  labs(
    title = "4 Label Specification",
    x = NULL,
    y = "% Coded as 1s",
    fill = "Label Order"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.minor = element_blank()
  )

tt4

# Place both panels side by side and collect their legends into one
cav <- tt8 + tt4 +
  plot_layout(guides = "collect") +
  theme(legend.position = "right")

print(cav)

#ggsave("diff_random_means.png", cav, width = 10, height = 5)
```


```{r plot column means}
# Flatten the nested column-wise t-test results into one data frame.
#
# Arguments:
#   columnwise_tests - named list (one element per model / prompt setup) of
#                      named lists (one element per column) of "htest" objects,
#                      as stored in extract_and_test_data()$columnwise_tests.
#
# Returns a tibble with one row per (setup, column):
#   Model_Label - setup name
#   Column      - tested column name
#   Mean_OG     - mean of the OG group
#   Mean_RAND   - mean of the random-order group
#   P_Value     - p-value of the test
# An empty input yields a zero-row tibble with the same columns.
extract_columnwise_ttest_results <- function(columnwise_tests) {
  # Typed zero-row frame so the schema is fixed even when nothing is bound to it
  template <- tibble(
    Model_Label = character(),
    Column = character(),
    Mean_OG = numeric(),
    Mean_RAND = numeric(),
    P_Value = numeric()
  )

  per_model <- lapply(names(columnwise_tests), function(model) {
    model_tests <- columnwise_tests[[model]]
    columns <- as.character(names(model_tests))
    # Pull one number out of every test of this model, in column order
    pull_number <- function(getter) {
      vapply(
        columns,
        function(column) getter(model_tests[[column]]),
        numeric(1),
        USE.NAMES = FALSE
      )
    }
    # extract_and_test_data() calls t.test(RAND, OG): estimate 1 ("mean of x")
    # is the RAND mean and estimate 2 ("mean of y") is the OG mean.
    tibble(
      Model_Label = rep(model, length(columns)),
      Column = columns,
      Mean_OG = pull_number(function(res) unname(res$estimate[2])),
      Mean_RAND = pull_number(function(res) unname(res$estimate[1])),
      P_Value = pull_number(function(res) res$p.value)
    )
  })

  # Bind once instead of growing the result row by row
  bind_rows(c(list(template), per_model))
}

# Recompute the RAND-vs-OG tests (same call as in the earlier chunk)
test_results <- extract_and_test_data(rand_data = rand_data, og_data = og_data)

# One row per (setup, column) with both group means and the p-value
columnwise_results <- extract_columnwise_ttest_results(
  test_results[["columnwise_tests"]]
)

# Display the results
print(columnwise_results)

# Long format for plotting: one row per (setup, column, Mean_OG / Mean_RAND)
columnwise_long <- pivot_longer(
  columnwise_results,
  cols = c(Mean_OG, Mean_RAND),
  names_to = "Type",
  values_to = "Mean"
)

# Flag tests with p <= 0.05 (sig = 1, otherwise 0) and split on whether the
# setup label contains "10"
columnwise_long_10 <- columnwise_long %>%
  mutate(sig = if_else(P_Value <= 0.05, 1, 0)) %>%
  filter(str_detect(Model_Label, "10"))

columnwise_long_4 <- columnwise_long %>%
  mutate(sig = if_else(P_Value <= 0.05, 1, 0)) %>%
  filter(!str_detect(Model_Label, "10"))


```


```{r}

```

